# Load the KRX daily price export for Kakao.
# With this encoding the Korean header row is mis-decoded; the columns are
# renamed after loading, and the date/numeric columns are plain ASCII.
# NOTE(review): the file is probably CP949/EUC-KR encoded -- confirm before
# changing `fileEncoding`.
kakao <- read.csv("kakaodata_1206.csv", fileEncoding = "ISO-8859-1")
-
데이터 source: http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020203
참고 https://velog.io/@isitcake_yes/mlarimastockprediction
# Preview the first rows of the freshly loaded data frame.
head(kakao)
ÀÏÀÚ | Á... | X.ëºñ | µî.ô.ü | X.Ã.. | X.í.. | Àú.. | X.Å.... | X.Å...ë.Ý | X.Ã..ÃÑ.. | X.óÀåÁÖ.Ä.ö | |
---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <dbl> | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <int> | |
1 | 2023/12/06 | 50900 | 100 | 0.20 | 50700 | 51100 | 50200 | 735503 | 37302792200 | 2.262484e+13 | 444495970 |
2 | 2023/12/05 | 50800 | 0 | 0.00 | 50700 | 51200 | 50200 | 1142447 | 57992282100 | 2.258040e+13 | 444495970 |
3 | 2023/12/04 | 50800 | 1100 | 2.21 | 49800 | 51300 | 49750 | 1785113 | 90624610300 | 2.258040e+13 | 444495970 |
4 | 2023/12/01 | 49700 | -800 | -1.58 | 50400 | 50400 | 49650 | 1105367 | 55174771850 | 2.209145e+13 | 444495970 |
5 | 2023/11/30 | 50500 | 0 | 0.00 | 50200 | 50900 | 50000 | 1613598 | 81271347200 | 2.244705e+13 | 444495970 |
6 | 2023/11/29 | 50500 | -600 | -1.17 | 50800 | 51300 | 50100 | 1284120 | 65153113600 | 2.244705e+13 | 444495970 |
# Replace the mis-decoded header with the intended Korean column names.
colnames(kakao) <- c(
  "일자", "종가", "대비", "등락률", "시가", "고가",
  "저가", "거래량", "거래대금", "시가총액", "상장주식수"
)
# Sort oldest-first; the "YYYY/MM/DD" date strings order correctly as text.
kakao <- kakao[order(kakao$일자), ]
# Drop the old row labels so rows are numbered 1..n in the new order.
rownames(kakao) <- NULL
head(kakao)
일자 | 종가 | 대비 | 등락률 | 시가 | 고가 | 저가 | 거래량 | 거래대금 | 시가총액 | 상장주식수 | |
---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <dbl> | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <int> | |
1 | 2023/01/02 | 52700 | -400 | -0.75 | 53600 | 53800 | 52400 | 887667 | 46979376500 | 2.347313e+13 | 445410387 |
2 | 2023/01/03 | 53300 | 600 | 1.14 | 52400 | 53500 | 51400 | 1420569 | 74588286800 | 2.374037e+13 | 445410387 |
3 | 2023/01/04 | 55700 | 2400 | 4.50 | 53200 | 56000 | 53100 | 2241411 | 123346180300 | 2.480936e+13 | 445410387 |
4 | 2023/01/05 | 57700 | 2000 | 3.59 | 55800 | 58200 | 55700 | 3046064 | 175103778900 | 2.570018e+13 | 445410387 |
5 | 2023/01/06 | 57200 | -500 | -0.87 | 57200 | 58000 | 56500 | 1420345 | 81326211100 | 2.547747e+13 | 445410387 |
6 | 2023/01/09 | 61100 | 3900 | 6.82 | 58700 | 61200 | 58300 | 3482961 | 208443993900 | 2.721457e+13 | 445410387 |
- 시계열 데이터 분석을 하기 위해, 일단.. 일자와 종가만 선택하자.
# Keep only the closing price column and wrap it in a ts object
# (default index 1, 2, ..., since no start/frequency is given).
data <- kakao$종가
data <- ts(data)
head(data)
A Time Series:
- 52700
- 53300
- 55700
- 57700
- 57200
- 61100
해당 데이터는 주식 데이터이기 때문에 공휴일 등으로 중간에 비어 있는 일자가 있다. 그래서 날짜 인덱스를 쓰지 않고 ts 객체로 변환해 그대로 분석했다.
데이터가.. 아주 그냥 별로다
# Time plot together with ACF and PACF of the closing-price series.
forecast::tsdisplay(data)
주식데이터가 평균이 일정하지 않은 비정상성의 특징을 보인다.
ARIMA의 d차수가 1이상일 것이다.
계절성분이나 다른 특징은 보이지 않는다.
# Time plot of the series, followed by its overall sample mean.
stats::plot.ts(z)
base::mean(z)
53663.0434782609
# Draw the ACF and PACF side by side, out to lag 60.
graphics::par(mfrow = c(1, 2))
stats::acf(z, lag.max = 60)
stats::pacf(z, lag.max = 60)
ACF가 천천히 줄어들고, PACF는 첫 번째 시차만 유의하고 나머지는 0에 가깝다.
시도표를 확인했을 때 계절성분은 없어보이고.. 추세는 조금 있어 보인다.
AR(1)모형..?
# ggplot version of the time plot + ACF/PACF, with a smoothed trend line.
forecast::ggtsdisplay(z,
                      smooth = TRUE)
`geom_smooth()` using formula = 'y ~ x'
- ACF가 천천히 감소한다. 확률적 추세가 있어보인다.
# Augmented Dickey-Fuller unit-root tests (type "c") at three lag orders.
fUnitRoots::adfTest(z, lags = 1, type = "c")
fUnitRoots::adfTest(z, lags = 30, type = "c")
fUnitRoots::adfTest(z, lags = 60, type = "c")
Title:
Augmented Dickey-Fuller Test
Test Results:
PARAMETER:
Lag Order: 1
STATISTIC:
Dickey-Fuller: -1.1594
P VALUE:
0.6251
Description:
Wed Dec 6 16:27:38 2023 by user:
Title:
Augmented Dickey-Fuller Test
Test Results:
PARAMETER:
Lag Order: 30
STATISTIC:
Dickey-Fuller: -1.2253
P VALUE:
0.6005
Description:
Wed Dec 6 16:27:38 2023 by user:
Title:
Augmented Dickey-Fuller Test
Test Results:
PARAMETER:
Lag Order: 60
STATISTIC:
Dickey-Fuller: -0.2159
P VALUE:
0.9282
Description:
Wed Dec 6 16:27:38 2023 by user:
- 유의확률이 0.05보다 크므로 귀무가설을 기각할 수 없다. 즉 차분이 필요하다.
# Same display for the first-differenced series.
forecast::ggtsdisplay(diff(z),
                      smooth = TRUE)
`geom_smooth()` using formula = 'y ~ x'
0을 중심으로 움직인다.
PACF가 감소하는 것 같기도 하다.
ACF 그림은 모양이 이상하다.
## One-sample t-test on the differenced series; H0: mu = 0
stats::t.test(lag_z, mu = 0)
One Sample t-test
data: lag_z
t = -0.10283, df = 228, p-value = 0.9182
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
-158.4750 142.7545
sample estimates:
mean of x
-7.860262
- p-value = 0.9182로 귀무가설(평균 = 0)을 기각할 수 없다. 즉 차분한 계열의 평균은 0으로 볼 수 있다.
-
차분한 모형 (include.mean = F로 평균항 제외)
# MA(1) on the differenced series, without a mean term.
fit1 <- arima(lag_z, order = c(0, 0, 1), include.mean = FALSE)
fit1
Call:
arima(x = lag_z, order = c(0, 0, 1), include.mean = F)
Coefficients:
ma1
-0.0137
s.e. 0.0646
sigma^2 estimated as 1331943: log likelihood = -1939.63, aic = 3883.27
-
원 모형
# ARIMA(0,1,1) on the original series; arima() does the differencing itself.
fit <- arima(z, order = c(0, 1, 1))
fit
Call:
arima(x = z, order = c(0, 1, 1))
Coefficients:
ma1
-0.0137
s.e. 0.0646
sigma^2 estimated as 1331943: log likelihood = -1939.63, aic = 3883.27
\((1-B)Z_t = ε_t - 0.0137ε_{t-1}, \hat θ = 0.0137\)
예측
# Forecast from the fitted ARIMA(0,1,1) using the default horizon.
fore_fit <- forecast::forecast(fit)
fore_fit
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
Mar 20 50898.63 49419.59 52377.66 48636.64 53160.62
Apr 20 50898.63 48821.22 52976.03 47721.51 54075.75
May 20 50898.63 48360.19 53437.07 47016.42 54780.84
Jun 20 50898.63 47970.87 53826.38 46421.01 55376.24
Jul 20 50898.63 47627.57 54169.68 45895.98 55901.28
Aug 20 50898.63 47317.03 54480.23 45421.04 56376.22
Sep 20 50898.63 47031.34 54765.92 44984.12 56813.14
Oct 20 50898.63 46765.35 55031.91 44577.32 57219.94
Nov 20 50898.63 46515.47 55281.79 44195.17 57602.09
Dec 20 50898.63 46279.09 55518.17 43833.65 57963.60
Jan 21 50898.63 46054.23 55743.03 43489.76 58307.50
Feb 21 50898.63 45839.35 55957.90 43161.13 58636.12
Mar 21 50898.63 45633.24 56164.02 42845.91 58951.35
Apr 21 50898.63 45434.89 56362.36 42542.57 59254.69
May 21 50898.63 45243.50 56553.76 42249.86 59547.40
Jun 21 50898.63 45058.38 56738.88 41966.73 59830.52
Jul 21 50898.63 44878.94 56918.31 41692.32 60104.94
Aug 21 50898.63 44704.71 57092.55 41425.84 60371.41
Sep 21 50898.63 44535.24 57262.02 41166.66 60630.59
Oct 21 50898.63 44370.17 57427.09 40914.21 60883.05
Nov 21 50898.63 44209.17 57588.09 40667.98 61129.27
Dec 21 50898.63 44051.96 57745.30 40427.55 61369.71
Jan 22 50898.63 43898.27 57898.99 40192.51 61604.75
Feb 22 50898.63 43747.89 58049.37 39962.52 61834.74
# Plot the forecast, then run the residual diagnostics for the fitted model.
plot(fore_fit)
forecast::checkresiduals(fit)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)
Q* = 23.178, df = 23, p-value = 0.4504
Model df: 1. Total lags used: 24
- 5번째 시차 뭐냐
# Extract the model residuals and inspect their time plot, ACF and PACF.
# The variable deliberately keeps the name `resid`; calling resid() afterwards
# still works because R skips non-function objects when resolving a call.
resid <- resid(fit)
forecast::tsdisplay(resid)
# Portmanteau test on the residuals ## H0 : rho1=...=rho_k=0
portes::LjungBox(fit, lags = c(6, 12, 18, 24))
lags | statistic | df | p-value | |
---|---|---|---|---|
6 | 8.728782 | 5 | 0.1203836 | |
12 | 12.770020 | 11 | 0.3086222 | |
18 | 17.077177 | 17 | 0.4491493 | |
24 | 23.178420 | 23 | 0.4503932 |
## Normality test
tseries::jarque.bera.test(resid) ## JB test, H0: normal
Jarque Bera Test
data: resid
X-squared = 23.774, df = 2, p-value = 6.878e-06
- 정규분포가 아니넹.
# Histogram and normal Q-Q plot of the residuals, side by side.
graphics::par(mfrow = c(1, 2))
graphics::hist(resid)
stats::qqnorm(resid, pch = 16)
stats::qqline(resid)
## Residual diagnostics
astsa::sarima(z, p = 0, d = 1, q = 1)
initial value 7.051150
iter 2 value 7.051051
iter 3 value 7.051051
iter 3 value 7.051051
iter 3 value 7.051051
final value 7.051051
converged
initial value 7.051051
iter 1 value 7.051051
final value 7.051051
converged
$fit
Call:
arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S),
xreg = constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc,
REPORT = 1, reltol = tol))
Coefficients:
ma1 constant
-0.0138 -7.9038
s.e. 0.0646 75.2190
sigma^2 estimated as 1331879: log likelihood = -1939.63, aic = 3885.26
$degrees_of_freedom
[1] 227
$ttable
Estimate SE t.value p.value
ma1 -0.0138 0.0646 -0.2130 0.8315
constant -7.9038 75.2190 -0.1051 0.9164
$AIC
[1] 16.96618
$AICc
[1] 16.96641
$BIC
[1] 17.01116
# Forecast 25 steps ahead from the fitted ARIMA(0,1,1).
fore_fit <- forecast::forecast(fit, h = 25)
fore_fit
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
Mar 20 50898.63 49419.59 52377.66 48636.64 53160.62
Apr 20 50898.63 48821.22 52976.03 47721.51 54075.75
May 20 50898.63 48360.19 53437.07 47016.42 54780.84
Jun 20 50898.63 47970.87 53826.38 46421.01 55376.24
Jul 20 50898.63 47627.57 54169.68 45895.98 55901.28
Aug 20 50898.63 47317.03 54480.23 45421.04 56376.22
Sep 20 50898.63 47031.34 54765.92 44984.12 56813.14
Oct 20 50898.63 46765.35 55031.91 44577.32 57219.94
Nov 20 50898.63 46515.47 55281.79 44195.17 57602.09
Dec 20 50898.63 46279.09 55518.17 43833.65 57963.60
Jan 21 50898.63 46054.23 55743.03 43489.76 58307.50
Feb 21 50898.63 45839.35 55957.90 43161.13 58636.12
Mar 21 50898.63 45633.24 56164.02 42845.91 58951.35
Apr 21 50898.63 45434.89 56362.36 42542.57 59254.69
May 21 50898.63 45243.50 56553.76 42249.86 59547.40
Jun 21 50898.63 45058.38 56738.88 41966.73 59830.52
Jul 21 50898.63 44878.94 56918.31 41692.32 60104.94
Aug 21 50898.63 44704.71 57092.55 41425.84 60371.41
Sep 21 50898.63 44535.24 57262.02 41166.66 60630.59
Oct 21 50898.63 44370.17 57427.09 40914.21 60883.05
Nov 21 50898.63 44209.17 57588.09 40667.98 61129.27
Dec 21 50898.63 44051.96 57745.30 40427.55 61369.71
Jan 22 50898.63 43898.27 57898.99 40192.51 61604.75
Feb 22 50898.63 43747.89 58049.37 39962.52 61834.74
Mar 22 50898.63 43600.61 58196.65 39737.27 62059.99
# Plot the 25-step forecast, then produce the same forecast with astsa.
plot(fore_fit)
astsa::sarima.for(z, n.ahead = 25, p = 0, d = 1, q = 1)
- $pred
-
A Time Series: 3 × 12 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 20 50890.61 50882.70 50874.80 50866.90 50858.99 50851.09 50843.18 50835.28 50827.38 50819.47 21 50811.57 50803.67 50795.76 50787.86 50779.95 50772.05 50764.15 50756.24 50748.34 50740.43 50732.53 50724.63 22 50716.72 50708.82 50700.92 A Time Series: 3 × 12 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 20 1154.070 1620.915 1980.620 2284.369 2552.221 2794.516 3017.417 3224.949 3419.910 3604.341 21 3779.783 3947.436 4108.252 4263.006 4412.336 4556.775 4696.774 4832.719 4964.943 5093.737 5219.352 5342.015 22 5461.924 5579.257 5694.172
# 25-step forecast of the differenced series with an MA(1) model.
astsa::sarima.for(lag_z, n.ahead = 25, p = 0, d = 0, q = 1)
- $pred
-
A Time Series: 3 × 12 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 20 -9.392518 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 21 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 -7.903833 22 -7.903833 -7.903833 -7.903833 A Time Series: 3 × 12 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 20 1154.07 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 21 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 1154.18 22 1154.18 1154.18 1154.18
훈련 : 테스트 = 8 : 2 로 분할
# Split the series by position: observations 1-184 for training,
# observation 185 onward for testing.
train_data <- window(z, start = start(z), end = index(z)[184])
test_data <- window(z, start = index(z)[185])
# Number of observations in the training window.
length(train_data)
184
# Number of observations in the test window.
length(test_data)
46
# Display the held-out test window.
test_data
Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
16 | 43950 | 41600 | 40850 | 42050 | 41400 | 42650 | 43650 | 43200 | ||||
17 | 43150 | 42800 | 41800 | 40500 | 39050 | 37950 | 39600 | 38950 | 37650 | 37750 | 38000 | 37800 |
18 | 37600 | 38550 | 41300 | 44700 | 43750 | 43950 | 45600 | 45650 | 45000 | 46350 | 48300 | 48200 |
19 | 47500 | 47800 | 49250 | 50300 | 50300 | 50500 | 49850 | 51100 | 50500 | 50500 | 49700 | 50800 |
20 | 50800 | 50900 |
# Forecast 46 steps ahead from the training window with ARIMA(0,1,1).
astsa::sarima.for(train_data, n.ahead = 46, p = 0, d = 1, q = 1)
- $pred
-
A Time Series: 5 × 12 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 16 43765.61 43716.78 43667.94 43619.11 43570.28 43521.44 43472.61 43423.78 17 43374.95 43326.11 43277.28 43228.45 43179.61 43130.78 43081.95 43033.11 42984.28 42935.45 42886.61 42837.78 18 42788.95 42740.11 42691.28 42642.45 42593.61 42544.78 42495.95 42447.12 42398.28 42349.45 42300.62 42251.78 19 42202.95 42154.12 42105.28 42056.45 42007.62 41958.78 41909.95 41861.12 41812.28 41763.45 41714.62 41665.79 20 41616.95 41568.12 A Time Series: 5 × 12 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 16 1154.524 1581.835 1916.110 2200.172 2451.537 2679.423 2889.392 3085.103 17 3269.119 3443.314 3609.112 3767.621 3919.725 4066.143 4207.469 4344.200 4476.756 4605.499 4730.740 4852.749 18 4971.765 5087.998 5201.635 5312.841 5421.767 5528.547 5633.304 5736.147 5837.179 5936.492 6034.171 6130.293 19 6224.931 6318.152 6410.017 6500.584 6589.907 6678.035 6765.015 6850.891 6935.703 7019.491 7102.291 7184.136 20 7265.059 7345.091
# Fit ARIMA(0,1,1) on the training window.
arima_model <- arima(train_data, order = c(0, 1, 1))

# Training-set prediction: in-sample one-step-ahead fitted values
# (observed minus residual). predict() only forecasts past the end of the
# sample, so it cannot produce values for the training period itself.
train_pred <- train_data - residuals(arima_model)

# Test-set prediction: forecast as many steps ahead as there are test points.
test_pred <- predict(arima_model, n.ahead = length(test_data))

# Visualization. The axes are widened so the forecast horizon, which lies
# after the last training observation, falls inside the plot region.
plot(train_data, col = "blue", type = "l", lty = 1, lwd = 2,
     main = "ARIMA Model Forecast",
     xlim = range(time(train_data), time(test_pred$pred)),
     ylim = range(train_data, train_pred, test_pred$pred))
lines(train_pred, col = "red", lty = 2, lwd = 2)
lines(test_pred$pred, col = "green", lty = 2, lwd = 2)
legend("topright", legend = c("Original", "Train Prediction", "Test Prediction"), col = c("blue", "red", "green"), lty = c(1, 2, 2), lwd = c(2, 2, 2))
# Fit the ARIMA model.
arima_model <- arima(train_data, order = c(0, 1, 1))

# Training-set prediction: in-sample one-step-ahead fitted values
# (observed minus residual); predict() cannot produce in-sample values.
train_pred <- train_data - residuals(arima_model)

# Test-set prediction: predict() already returns forecasts on the level
# scale, so no first-difference term is added.
test_pred <- predict(arima_model, n.ahead = length(test_data))$pred

# Visualization. `data` and the train/test windows appear to carry different
# time indices, so everything is drawn by observation number instead.
n_train <- length(train_data)
n_test <- length(test_data)
plot(seq_along(data), as.numeric(data), col = "blue", type = "l", lty = 1, lwd = 2,
     xlab = "Time", ylab = "data", main = "ARIMA Model Forecast")
lines(seq_len(n_train), as.numeric(train_pred), col = "red", lty = 2, lwd = 2)
lines(n_train + seq_len(n_test), as.numeric(test_pred), col = "green", lty = 2, lwd = 2)
legend("topright", legend = c("Original", "Train Prediction", "Test Prediction"), col = c("blue", "red", "green"), lty = c(1, 2, 2), lwd = c(2, 2, 2))
- index가 안맞아서 위와같이 되네;; 흠..
# Start again from the plain closing-price vector (no ts attributes).
data <- kakao$종가
length(data)
230
# Positional split of the plain vector: first 184 points for training, the
# rest for testing. The test vector keeps its original name `ts`.
tr <- data[1:184]
ts <- data[185:length(data)]
# Forecast 46 steps ahead from the training part with ARIMA(0,1,1).
astsa::sarima.for(tr, n.ahead = 46, p = 0, d = 1, q = 1)
- $pred
- A Time Series:
- 43765.6102424124
- 43716.777101872
- 43667.9439613316
- 43619.1108207913
- 43570.2776802509
- 43521.4445397106
- 43472.6113991702
- 43423.7782586298
- 43374.9451180894
- 43326.1119775491
- 43277.2788370087
- 43228.4456964683
- 43179.612555928
- 43130.7794153876
- 43081.9462748472
- 43033.1131343069
- 42984.2799937665
- 42935.4468532261
- 42886.6137126858
- 42837.7805721454
- 42788.947431605
- 42740.1142910647
- 42691.2811505243
- 42642.4480099839
- 42593.6148694436
- 42544.7817289032
- 42495.9485883628
- 42447.1154478225
- 42398.2823072821
- 42349.4491667417
- 42300.6160262014
- 42251.782885661
- 42202.9497451206
- 42154.1166045803
- 42105.2834640399
- 42056.4503234995
- 42007.6171829592
- 41958.7840424188
- 41909.9509018784
- 41861.1177613381
- 41812.2846207977
- 41763.4514802573
- 41714.618339717
- 41665.7851991766
- 41616.9520586362
- 41568.1189180958
- $se
- A Time Series:
- 1154.52422170161
- 1581.83516172709
- 1916.11032543969
- 2200.17160231584
- 2451.53653450596
- 2679.4230125902
- 2889.39162814223
- 3085.10296119252
- 3269.11862457442
- 3443.31422933193
- 3609.11196588582
- 3767.62066597484
- 3919.72470756061
- 4066.1429000272
- 4207.46888090952
- 4344.19965979955
- 4476.75630167457
- 4605.49924383485
- 4730.73985602082
- 4852.74931206263
- 4971.76549991095
- 5087.99847547984
- 5201.6348186763
- 5312.84115019884
- 5421.76699864782
- 5528.54715888424
- 5633.30364780627
- 5736.14733848146
- 5837.1793350169
- 5936.49213673604
- 6034.17062983052
- 6130.2929367437
- 6224.93114746422
- 6318.15195219699
- 6410.01719119622
- 6500.58433464343
- 6589.90690315061
- 6678.03483762794
- 6765.0148257748
- 6850.89059125406
- 6935.70315063396
- 7019.49104238294
- 7102.29053154507
- 7184.13579318036
- 7265.05907720203
- 7345.09085686597